knitr::opts_chunk$set(
  echo = TRUE,
  error = FALSE,
  comment = "#>",
  fig.path = "img/",
  fig.retina = 2,
  fig.width = 10,
  fig.asp = 3/4, 
  fig.height = 8,
  fig.pos = "t",
  fig.align = "center",
  dpi = 150,
  out.width = "90%",
  dev.args = list(png = list(type = "cairo-png")),
  optipng = "-o1 -quiet"
)

1 cluster+rank+predict

1.0.0.1 Loading Required Packages

library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.2     ✔ readr     2.1.4
#> ✔ forcats   1.0.0     ✔ stringr   1.5.0
#> ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
#> ✔ purrr     1.0.1     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
#> Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(scorecard)
#> 
#> Attaching package: 'scorecard'
#> 
#> The following object is masked from 'package:tidyr':
#> 
#>     replace_na
library(glmnet)
#> Loading required package: Matrix
#> 
#> Attaching package: 'Matrix'
#> 
#> The following objects are masked from 'package:tidyr':
#> 
#>     expand, pack, unpack
#> 
#> Loaded glmnet 4.1-8
library(ggplot2)
library(plotly)
#> 
#> Attaching package: 'plotly'
#> 
#> The following object is masked from 'package:ggplot2':
#> 
#>     last_plot
#> 
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> 
#> The following object is masked from 'package:graphics':
#> 
#>     layout
library(dplyr)
library(knitr)
library(gridExtra)
#> 
#> Attaching package: 'gridExtra'
#> 
#> The following object is masked from 'package:dplyr':
#> 
#>     combine
library(grid)
library(cluster)
library(factoextra)
library(modeest)
#> Registered S3 method overwritten by 'rmutil':
#>   method         from
#>   print.response httr

1.1 part1_Import Data

# Load the data
train_data <- read.csv("C:/Users/lenovo/Downloads/train.csv")

# Preview the data
str(train_data)
#> 'data.frame':    81738 obs. of  21 variables:
#>  $ loan_amnt            : int  5000 2400 5000 3000 5600 5375 6500 9000 3000 10000 ...
#>  $ funded_amnt          : int  5000 2400 5000 3000 5600 5375 6500 9000 3000 10000 ...
#>  $ pymnt_plan           : chr  "n" "n" "n" "n" ...
#>  $ grade                : chr  "B" "C" "A" "E" ...
#>  $ sub_grade_num        : num  0.4 1 0.8 0.2 0.4 1 0.6 0.2 0.2 0.4 ...
#>  $ short_emp            : int  0 0 0 0 0 1 0 1 0 0 ...
#>  $ emp_length_num       : int  11 11 4 10 5 1 6 1 4 4 ...
#>  $ home_ownership       : chr  "RENT" "RENT" "RENT" "RENT" ...
#>  $ dti                  : num  27.65 8.72 11.2 5.35 5.55 ...
#>  $ purpose              : chr  "credit_card" "small_business" "wedding" "car" ...
#>  $ payment_inc_ratio    : num  8.14 8.26 5.22 2.74 4.57 ...
#>  $ delinq_2yrs          : int  0 0 0 0 0 0 0 0 0 0 ...
#>  $ delinq_2yrs_zero     : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ inq_last_6mths       : int  1 2 3 2 2 0 2 1 2 2 ...
#>  $ last_delinq_none     : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ last_major_derog_none: int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ open_acc             : int  3 2 9 4 11 2 14 4 11 14 ...
#>  $ pub_rec              : int  0 0 0 0 0 0 0 0 0 0 ...
#>  $ pub_rec_zero         : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ revol_util           : num  83.7 98.5 28.3 87.5 32.6 36.5 20.6 91.7 43.1 55.5 ...
#>  $ bad_loans            : int  0 0 0 0 1 1 0 1 0 1 ...
purpose_counts <- table(train_data$purpose)
print(purpose_counts)
#> 
#>                car        credit_card debt_consolidation   home_improvement 
#>               1570              14722              45428               4990 
#>              house     major_purchase            medical             moving 
#>                665               2580               1085                792 
#>              other     small_business           vacation            wedding 
#>               6107               2173                587               1039
home_ownership_counts <- table(train_data$home_ownership)
print(home_ownership_counts)
#> 
#> MORTGAGE    OTHER      OWN     RENT 
#>    39583      116     6639    35400
grade_counts <- table(train_data$grade)
print(grade_counts)
#> 
#>     A     B     C     D     E     F     G 
#> 14812 24775 19928 12847  6022  2611   743
# Check for missing values and duplicates
sum(is.na(train_data))
#> [1] 97
# Convert categorical data to numeric
train_data <- train_data %>%
  mutate(
    grade = as.numeric(factor(grade), levels = c("A", "B", "C", "D", "E", "F" , "G")),
    purpose = as.numeric(factor(purpose), levels = c("car", "credit_card", "debt_consolidation", "home_improvement", "house", "major_purchase","medical","moving", "other", "small_business", "vacation", "wedding")),
    home_ownership = as.numeric(factor(home_ownership), levels = c("MORTGAGE", "OTHER", "OWN", "RENT"))
  )

# Remove unnecessary columns
train_data <- dplyr::select(train_data, -pymnt_plan)

# Check for missing values and handle them
na_counts <- sapply(train_data, function(x) sum(is.na(x)))
print(na_counts)
#>             loan_amnt           funded_amnt                 grade 
#>                     0                     0                     0 
#>         sub_grade_num             short_emp        emp_length_num 
#>                     0                     0                     0 
#>        home_ownership                   dti               purpose 
#>                     0                     0                     0 
#>     payment_inc_ratio           delinq_2yrs      delinq_2yrs_zero 
#>                     1                    16                    16 
#>        inq_last_6mths      last_delinq_none last_major_derog_none 
#>                    16                     0                     0 
#>              open_acc               pub_rec          pub_rec_zero 
#>                    16                    16                    16 
#>            revol_util             bad_loans 
#>                     0                     0
set.seed(1)
total_rows <- nrow(train_data)
subset_size <- total_rows / 2
random_subset <- train_data %>%
  sample_n(subset_size)
cat("Total rows:", total_rows, "\n")
#> Total rows: 81738
cat("Subset size:", subset_size, "\n")
#> Subset size: 40869
print(head(random_subset))
#>   loan_amnt funded_amnt grade sub_grade_num short_emp emp_length_num
#> 1     12000       12000     3           1.0         0              4
#> 2     18550       18550     4           0.4         0              5
#> 3     10000       10000     4           0.6         0              5
#> 4      9600        9600     1           0.4         1              0
#> 5     12000       12000     1           0.4         0             11
#> 6      6000        6000     5           0.4         0              6
#>   home_ownership   dti purpose payment_inc_ratio delinq_2yrs delinq_2yrs_zero
#> 1              4  9.14       3          11.76200           0                1
#> 2              4 18.64       7           8.77438           0                1
#> 3              1 10.87       2           6.54269           0                1
#> 4              4 19.14       3           5.89520           1                0
#> 5              1  4.20      10           6.23880           0                1
#> 6              4 19.92       3           4.27303           0                1
#>   inq_last_6mths last_delinq_none last_major_derog_none open_acc pub_rec
#> 1              5                0                     1       15       1
#> 2              1                1                     1        4       0
#> 3              3                1                     1        8       0
#> 4              1                0                     1       13       0
#> 5              0                1                     1       14       0
#> 6              6                0                     1       16       0
#>   pub_rec_zero revol_util bad_loans
#> 1            0       63.5         0
#> 2            1       47.1         0
#> 3            1       50.8         0
#> 4            1       23.7         0
#> 5            1        6.0         0
#> 6            1       45.1         0
# dataframe
train_data1 <- random_subset

1.2 part2_Iv

iv = iv(train_data1, y = 'bad_loans') %>%
  as_tibble() %>%
  mutate( info_value = round(info_value, 3) ) %>%
  arrange( desc(info_value) )

iv %>%
  knitr::kable()
variable info_value
dti 0.485
grade 0.329
revol_util 0.252
loan_amnt 0.236
funded_amnt 0.234
payment_inc_ratio 0.081
purpose 0.042
inq_last_6mths 0.041
home_ownership 0.017
emp_length_num 0.010
open_acc 0.008
short_emp 0.004
delinq_2yrs 0.003
pub_rec 0.001
delinq_2yrs_zero 0.001
sub_grade_num 0.001
last_delinq_none 0.000
last_major_derog_none 0.000
pub_rec_zero 0.000

1.3 part3_WOE

bins = woebin(train_data1, y = 'bad_loans')
#> ℹ Creating woe binning ...
#> ✔ Binning on 40869 rows and 20 columns in 00:00:12

1.3.1 woe_plot

####plot-bins
variables <- names(bins[])

for (var in variables) {
  # Print the table for each variable
  bins[[var]] %>%
    knitr::kable()
  
  # Plot with specified colors
  plot <- woebin_plot(bins[[var]], line_color = 'grey4', bar_color = c('steelblue3', 'sandybrown'), show_barval = FALSE)
  
  # Print the plot
  print(plot)
}
#> $loan_amnt

#> 
#> $funded_amnt

#> 
#> $grade

#> 
#> $sub_grade_num

#> 
#> $short_emp

#> 
#> $emp_length_num

#> 
#> $home_ownership

#> 
#> $dti

#> 
#> $purpose

#> 
#> $payment_inc_ratio

#> 
#> $delinq_2yrs

#> 
#> $delinq_2yrs_zero

#> 
#> $inq_last_6mths

#> 
#> $last_delinq_none

#> 
#> $last_major_derog_none

#> 
#> $open_acc

#> 
#> $pub_rec

#> 
#> $pub_rec_zero

#> 
#> $revol_util

plots <- list()
for (var in variables[c(1:7, 10, 12)]) {
  if (var %in% names(bins)) {
    plot <- woebin_plot(bins[[var]], line_color = 'grey4', bar_color = c('steelblue3', 'sandybrown'), show_barval = FALSE)
    plots[[var]] <- plot[[1]]  
  } else {
    cat("\n### Variable:", var, "not found in bins\n")
  }
}

do.call(grid.arrange, c(plots, ncol = 3))

1.3.2 woe_table

# Define a function to add percentage columns
add_percentage_columns <- function(bin_data) {
  bin_data %>%
    mutate(
      percentage_pos = (pos / count) * 100,
      percentage_neg = (neg / count) * 100
    )
}
# Apply the function to each bin in the list
for (var in variables) {
  if (var %in% names(bins)) {
    bins[[var]] <- add_percentage_columns(bins[[var]])
    cat("\n### Added percentage columns to variable:", var, "\n")
  } else {
    cat("\n### Variable:", var, "not found in bins\n")
  }
}
#> 
#> ### Added percentage columns to variable: loan_amnt 
#> 
#> ### Added percentage columns to variable: funded_amnt 
#> 
#> ### Added percentage columns to variable: grade 
#> 
#> ### Added percentage columns to variable: sub_grade_num 
#> 
#> ### Added percentage columns to variable: short_emp 
#> 
#> ### Added percentage columns to variable: emp_length_num 
#> 
#> ### Added percentage columns to variable: home_ownership 
#> 
#> ### Added percentage columns to variable: dti 
#> 
#> ### Added percentage columns to variable: purpose 
#> 
#> ### Added percentage columns to variable: payment_inc_ratio 
#> 
#> ### Added percentage columns to variable: delinq_2yrs 
#> 
#> ### Added percentage columns to variable: delinq_2yrs_zero 
#> 
#> ### Added percentage columns to variable: inq_last_6mths 
#> 
#> ### Added percentage columns to variable: last_delinq_none 
#> 
#> ### Added percentage columns to variable: last_major_derog_none 
#> 
#> ### Added percentage columns to variable: open_acc 
#> 
#> ### Added percentage columns to variable: pub_rec 
#> 
#> ### Added percentage columns to variable: pub_rec_zero 
#> 
#> ### Added percentage columns to variable: revol_util
######################################
for (var in variables) {
  if (var %in% names(bins)) {
    table <- bins[[var]] %>%
      dplyr::select(variable, bin, woe, count, percentage_pos, percentage_neg) %>%
      dplyr::rename(
        Variable = variable,
        Bin = bin,
        WOE = woe,
        Count = count,
        `Positive Rate (%)` = percentage_pos,
        `Negative Rate (%)` = percentage_neg
      )
    
    cat("\n### Variable:", var, "\n")
    print(knitr::kable(table, format = "pipe", digits = 2))
  } else {
    cat("\n### Variable:", var, "not found in bins\n")
  }
}
#> 
#> ### Variable: loan_amnt 
#> 
#> 
#> |Variable  |Bin          |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:---------|:------------|-----:|-----:|-----------------:|-----------------:|
#> |loan_amnt |[-Inf,8000)  | -0.18| 12535|             16.51|             83.49|
#> |loan_amnt |[8000,15500) | -0.07| 15915|             18.09|             81.91|
#> |loan_amnt |[15500, Inf) |  0.24| 12419|             23.05|             76.95|
#> 
#> ### Variable: funded_amnt 
#> 
#> 
#> |Variable    |Bin          |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:-----------|:------------|-----:|-----:|-----------------:|-----------------:|
#> |funded_amnt |[-Inf,8000)  | -0.17| 12618|             16.56|             83.44|
#> |funded_amnt |[8000,15500) | -0.07| 15990|             18.11|             81.89|
#> |funded_amnt |[15500, Inf) |  0.24| 12261|             23.04|             76.96|
#> 
#> ### Variable: grade 
#> 
#> 
#> |Variable |Bin      |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------|:--------|-----:|-----:|-----------------:|-----------------:|
#> |grade    |[-Inf,2) | -1.12|  7309|              7.14|             92.86|
#> |grade    |[2,3)    | -0.32| 12370|             14.66|             85.34|
#> |grade    |[3,4)    |  0.10| 10069|             20.77|             79.23|
#> |grade    |[4,5)    |  0.42|  6414|             26.47|             73.53|
#> |grade    |[5, Inf) |  0.86|  4707|             35.84|             64.16|
#> 
#> ### Variable: sub_grade_num 
#> 
#> 
#> |Variable      |Bin        |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:-------------|:----------|-----:|-----:|-----------------:|-----------------:|
#> |sub_grade_num |[-Inf,0.4) | -0.05|  7966|             18.40|             81.60|
#> |sub_grade_num |[0.4,0.6)  | -0.01|  8283|             18.97|             81.03|
#> |sub_grade_num |[0.6,0.8)  |  0.00|  8399|             19.17|             80.83|
#> |sub_grade_num |[0.8, Inf) |  0.03| 16221|             19.51|             80.49|
#> 
#> ### Variable: short_emp 
#> 
#> 
#> |Variable  |Bin      |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:---------|:--------|-----:|-----:|-----------------:|-----------------:|
#> |short_emp |[-Inf,1) | -0.02| 35773|             18.74|             81.26|
#> |short_emp |[1, Inf) |  0.16|  5096|             21.74|             78.26|
#> 
#> ### Variable: emp_length_num 
#> 
#> 
#> |Variable       |Bin      |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------------|:--------|-----:|-----:|-----------------:|-----------------:|
#> |emp_length_num |[-Inf,2) |  0.16|  5096|             21.74|             78.26|
#> |emp_length_num |[2,3)    |  0.03|  2811|             19.53|             80.47|
#> |emp_length_num |[3, Inf) | -0.03| 32962|             18.67|             81.33|
#> 
#> ### Variable: home_ownership 
#> 
#> 
#> |Variable       |Bin      |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------------|:--------|-----:|-----:|-----------------:|-----------------:|
#> |home_ownership |[-Inf,2) | -0.14| 19843|             17.08|             82.92|
#> |home_ownership |[2,4)    |  0.08|  3373|             20.43|             79.57|
#> |home_ownership |[4, Inf) |  0.13| 17653|             21.15|             78.85|
#> 
#> ### Variable: dti 
#> 
#> 
#> |Variable |Bin       |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------|:---------|-----:|-----:|-----------------:|-----------------:|
#> |dti      |[-Inf,7)  | -0.37|  5650|             14.07|             85.93|
#> |dti      |[7,17)    | -0.14| 18037|             17.03|             82.97|
#> |dti      |[17,26)   |  0.14| 13550|             21.37|             78.63|
#> |dti      |[26, Inf) |  0.54|  3632|             28.91|             71.09|
#> 
#> ### Variable: purpose 
#> 
#> 
#> |Variable |Bin      |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------|:--------|-----:|-----:|-----------------:|-----------------:|
#> |purpose  |[-Inf,3) | -0.20|  8128|             16.25|             83.75|
#> |purpose  |[3,4)    |  0.03| 22724|             19.54|             80.46|
#> |purpose  |[4,6)    | -0.16|  2782|             16.75|             83.25|
#> |purpose  |[6,9)    | -0.20|  2270|             16.26|             83.74|
#> |purpose  |[9, Inf) |  0.32|  4965|             24.47|             75.53|
#> 
#> ### Variable: payment_inc_ratio 
#> 
#> 
#> |Variable          |Bin        |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:-----------------|:----------|-----:|-----:|-----------------:|-----------------:|
#> |payment_inc_ratio |[-Inf,6.5) | -0.35| 18679|             14.21|             85.79|
#> |payment_inc_ratio |[6.5,10)   | -0.03| 11456|             18.60|             81.40|
#> |payment_inc_ratio |[10,12)    |  0.32|  4357|             24.51|             75.49|
#> |payment_inc_ratio |[12, Inf)  |  0.63|  6377|             30.69|             69.31|
#> 
#> ### Variable: delinq_2yrs 
#> 
#> 
#> |Variable    |Bin                |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:-----------|:------------------|-----:|-----:|-----------------:|-----------------:|
#> |delinq_2yrs |[-Inf,1)%,%missing | -0.01| 35079|             18.95|             81.05|
#> |delinq_2yrs |[1, Inf)           |  0.06|  5790|             20.12|             79.88|
#> 
#> ### Variable: delinq_2yrs_zero 
#> 
#> 
#> |Variable         |Bin                |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:----------------|:------------------|-----:|-----:|-----------------:|-----------------:|
#> |delinq_2yrs_zero |[-Inf,1)%,%missing |  0.06|  5796|             20.10|             79.90|
#> |delinq_2yrs_zero |[1, Inf)           | -0.01| 35073|             18.95|             81.05|
#> 
#> ### Variable: inq_last_6mths 
#> 
#> 
#> |Variable       |Bin                |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------------|:------------------|-----:|-----:|-----------------:|-----------------:|
#> |inq_last_6mths |[-Inf,1)%,%missing | -0.18| 18623|             16.42|             83.58|
#> |inq_last_6mths |[1,2)              |  0.04| 11548|             19.67|             80.33|
#> |inq_last_6mths |[2,3)              |  0.14|  6114|             21.43|             78.57|
#> |inq_last_6mths |[3, Inf)           |  0.37|  4584|             25.57|             74.43|
#> 
#> ### Variable: last_delinq_none 
#> 
#> 
#> |Variable         |Bin      |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:----------------|:--------|-----:|-----:|-----------------:|-----------------:|
#> |last_delinq_none |[-Inf,1) |  0.01| 16910|             19.32|             80.68|
#> |last_delinq_none |[1, Inf) | -0.01| 23959|             18.97|             81.03|
#> 
#> ### Variable: last_major_derog_none 
#> 
#> 
#> |Variable              |Bin      |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:---------------------|:--------|-----:|-----:|-----------------:|-----------------:|
#> |last_major_derog_none |[-Inf,1) | -0.03|  5181|             18.70|             81.30|
#> |last_major_derog_none |[1, Inf) |  0.00| 35688|             19.17|             80.83|
#> 
#> ### Variable: open_acc 
#> 
#> 
#> |Variable |Bin                |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------|:------------------|-----:|-----:|-----------------:|-----------------:|
#> |open_acc |[-Inf,5)%,%missing |  0.06|  2591|             19.99|             80.01|
#> |open_acc |[5,7)              | -0.05|  5434|             18.42|             81.58|
#> |open_acc |[7,10)             |  0.00| 11422|             19.06|             80.94|
#> |open_acc |[10,12)            |  0.03|  6942|             19.56|             80.44|
#> |open_acc |[12,13)            |  0.09|  2893|             20.57|             79.43|
#> |open_acc |[13,14)            | -0.06|  2378|             18.17|             81.83|
#> |open_acc |[14,16)            |  0.03|  3601|             19.58|             80.42|
#> |open_acc |[16, Inf)          | -0.05|  5608|             18.28|             81.72|
#> 
#> ### Variable: pub_rec 
#> 
#> 
#> |Variable |Bin                |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:--------|:------------------|-----:|-----:|-----------------:|-----------------:|
#> |pub_rec  |[-Inf,1)%,%missing |  0.00| 37093|             19.14|             80.86|
#> |pub_rec  |[1, Inf)           | -0.02|  3776|             18.80|             81.20|
#> 
#> ### Variable: pub_rec_zero 
#> 
#> 
#> |Variable     |Bin                |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:------------|:------------------|-----:|-----:|-----------------:|-----------------:|
#> |pub_rec_zero |[-Inf,1)%,%missing | -0.02|  3782|             18.77|             81.23|
#> |pub_rec_zero |[1, Inf)           |  0.00| 37087|             19.15|             80.85|
#> 
#> ### Variable: revol_util 
#> 
#> 
#> |Variable   |Bin       |   WOE| Count| Positive Rate (%)| Negative Rate (%)|
#> |:----------|:---------|-----:|-----:|-----------------:|-----------------:|
#> |revol_util |[-Inf,26) | -0.55|  6902|             12.03|             87.97|
#> |revol_util |[26,50)   | -0.18| 10387|             16.45|             83.55|
#> |revol_util |[50,72)   |  0.07| 12001|             20.21|             79.79|
#> |revol_util |[72,86)   |  0.25|  6892|             23.35|             76.65|
#> |revol_util |[86, Inf) |  0.42|  4687|             26.41|             73.59|
results_list <- list()
for (var in variables) {
  if (var %in% names(bins)) {
    table <- bins[[var]] %>%
      dplyr::select(variable, bin, woe, count, percentage_pos, percentage_neg, bin_iv) %>%
      dplyr::rename(
        Variable = variable,
        Bin = bin,
        WOE = woe,
        Count = count,
        iv = bin_iv,
        `Positive Rate (%)` = percentage_pos,
        `Negative Rate (%)` = percentage_neg
      )
    
    results_list[[var]] <- table
  }
}

combined_results <- bind_rows(results_list, .id = "Variable")

1.4 part4_feature_selection

1.4.1 PCA

train_woe <- woebin_ply(train_data1, bins)
#> ℹ Converting into woe values ...
#> ✔ Woe transformating on 40869 rows and 19 columns in 00:00:11
df_train <- train_woe

# Remove unnecessary columns
df1 <- dplyr::select(df_train, -bad_loans)

# Perform PCA
data_pca <- prcomp(df1, center = TRUE, scale. = F)
summary(data_pca)
#> Importance of components:
#>                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
#> Standard deviation     0.6260 0.3721 0.2820 0.22881 0.20052 0.17533 0.15299
#> Proportion of Variance 0.5004 0.1768 0.1016 0.06685 0.05135 0.03926 0.02989
#> Cumulative Proportion  0.5004 0.6772 0.7788 0.84562 0.89696 0.93622 0.96610
#>                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
#> Standard deviation     0.12092 0.08631 0.04382 0.03642 0.02566 0.01404 0.01132
#> Proportion of Variance 0.01867 0.00951 0.00245 0.00169 0.00084 0.00025 0.00016
#> Cumulative Proportion  0.98478 0.99429 0.99674 0.99843 0.99927 0.99953 0.99969
#>                            PC15     PC16     PC17      PC18      PC19
#> Standard deviation     0.009821 0.009364 0.007662 0.0006635 2.419e-16
#> Proportion of Variance 0.000120 0.000110 0.000070 0.0000000 0.000e+00
#> Cumulative Proportion  0.999810 0.999920 1.000000 1.0000000 1.000e+00
loadings <- data_pca$rotation
head(loadings)
#>                             PC1          PC2          PC3          PC4
#> loan_amnt_woe       0.073925136 -0.253820895  0.063586182 -0.198899745
#> funded_amnt_woe     0.073912131 -0.251889280  0.062201727 -0.195292233
#> grade_woe           0.945146341  0.221287928  0.199161724  0.035117355
#> sub_grade_num_woe  -0.005563003 -0.005599729 -0.009461251 -0.002620156
#> short_emp_woe      -0.002213074 -0.001496193  0.005141298  0.004143282
#> emp_length_num_woe -0.002408340 -0.001046150  0.006117357  0.004535543
#>                             PC5          PC6          PC7         PC8
#> loan_amnt_woe      -0.596521308  0.083046755 -0.070652246  0.16337237
#> funded_amnt_woe    -0.585355665  0.082238273 -0.068350612  0.16071107
#> grade_woe           0.032423462  0.105600793  0.057545178 -0.03161873
#> sub_grade_num_woe  -0.004397023 -0.013970272 -0.011040741  0.01266304
#> short_emp_woe       0.044973686 -0.008747212 -0.003109089  0.06880736
#> emp_length_num_woe  0.049823527 -0.009151991 -0.003684869  0.07775236
#>                             PC9          PC10         PC11          PC12
#> loan_amnt_woe      -0.024335682  0.0044949026 -0.003463462 -0.0037211962
#> funded_amnt_woe    -0.024055461  0.0030525030  0.001655281  0.0140260871
#> grade_woe          -0.004505497 -0.0008711838 -0.011588331 -0.0116795853
#> sub_grade_num_woe   0.001316874  0.0092330898  0.056617537 -0.9974973244
#> short_emp_woe      -0.693738384  0.0058779791  0.001623484  0.0007030922
#> emp_length_num_woe -0.709286624  0.0054178875  0.003796025 -0.0005814363
#>                             PC13         PC14          PC15         PC16
#> loan_amnt_woe       0.6992147436 -0.049363944 -0.0101610460  0.002800952
#> funded_amnt_woe    -0.7113296330  0.047644996  0.0111328205  0.001980115
#> grade_woe           0.0006441983  0.004242974  0.0001074442 -0.002173667
#> sub_grade_num_woe  -0.0106377005  0.025865070  0.0001360339 -0.014421276
#> short_emp_woe      -0.0152838291 -0.074032288 -0.6886503038 -0.178334445
#> emp_length_num_woe  0.0148283926  0.067794163  0.6727614217  0.175270894
#>                            PC17          PC18          PC19
#> loan_amnt_woe      -0.001259353  4.512122e-05  2.014199e-15
#> funded_amnt_woe    -0.001106719 -4.082085e-05 -7.996344e-16
#> grade_woe           0.001585032  8.722092e-06  9.584531e-17
#> sub_grade_num_woe   0.007808105  1.586855e-05  3.545324e-16
#> short_emp_woe      -0.006202354 -3.354520e-04  4.690120e-16
#> emp_length_num_woe  0.005881672  6.461365e-05 -5.546858e-16
fviz_eig(data_pca, addlabels = TRUE)

# Check the PCA output
head(data_pca$x)
#>             PC1         PC2       PC3         PC4          PC5          PC6
#> [1,]  0.2921316 -0.19089166 0.1057117 -0.05820084  0.219797540 -0.412702689
#> [2,]  0.5134660 -0.03691616 0.2125867  0.11715191 -0.304435505  0.095277648
#> [3,]  0.5207891  0.18552810 0.1064171 -0.05423267  0.007266799 -0.377334274
#> [4,] -1.1263020  0.07704632 0.1638929  0.32977103 -0.098309680  0.007099912
#> [5,] -1.1771336  0.15374169 0.3223907 -0.14675212 -0.013589018  0.164693871
#> [6,]  0.8368147  0.58757265 0.2756430  0.36246019  0.045630945 -0.191133919
#>              PC7         PC8         PC9         PC10         PC11
#> [1,] -0.05091001  0.09821871  0.06706506  0.028341563 -0.005671512
#> [2,]  0.18346330  0.20878863  0.03659958  0.059627400 -0.022679101
#> [3,]  0.19399641 -0.10711691  0.02778540 -0.003411623 -0.020516487
#> [4,] -0.06360999  0.19311549 -0.21201255 -0.055867561  0.097186839
#> [5,] -0.31250876 -0.15023069  0.02451762  0.028734360 -0.016472905
#> [6,]  0.01093339  0.11298191  0.05369666 -0.053209611 -0.023127403
#>               PC12          PC13         PC14         PC15         PC16
#> [1,] -0.0210751094 -0.0009599073 -0.012285314 -0.008755016  0.023993526
#> [2,]  0.0045274794 -0.0013699203  0.008656917 -0.003202198 -0.002783650
#> [3,] -0.0077617866  0.0004323956  0.009952554 -0.002231300 -0.004361317
#> [4,]  0.0209509095 -0.0005126908  0.003215737 -0.003733679 -0.001526565
#> [5,]  0.0111541083 -0.0014094692  0.004725085 -0.002400555 -0.001721166
#> [6,] -0.0002443128 -0.0013473640 -0.003498016 -0.001031571 -0.009234172
#>              PC17          PC18          PC19
#> [1,] -0.017875967 -2.174590e-05  2.755393e-16
#> [2,]  0.003341090  1.209072e-05  3.543329e-16
#> [3,]  0.004376651  3.914620e-05 -5.839496e-17
#> [4,] -0.007516465 -6.584101e-05 -2.862365e-16
#> [5,]  0.004050195 -5.141151e-05 -1.828037e-16
#> [6,] -0.011053840  4.810600e-05 -1.036341e-16
eig.val<-get_eigenvalue(data_pca)
eig.val
#>          eigenvalue variance.percent cumulative.variance.percent
#> Dim.1  3.918482e-01     5.003707e+01                    50.03707
#> Dim.2  1.384765e-01     1.768276e+01                    67.71983
#> Dim.3  7.953661e-02     1.015643e+01                    77.87626
#> Dim.4  5.235333e-02     6.685260e+00                    84.56152
#> Dim.5  4.020963e-02     5.134570e+00                    89.69609
#> Dim.6  3.074142e-02     3.925526e+00                    93.62162
#> Dim.7  2.340642e-02     2.988883e+00                    96.61050
#> Dim.8  1.462112e-02     1.867045e+00                    98.47754
#> Dim.9  7.449615e-03     9.512788e-01                    99.42882
#> Dim.10 1.919864e-03     2.451571e-01                    99.67398
#> Dim.11 1.326062e-03     1.693315e-01                    99.84331
#> Dim.12 6.586397e-04     8.410501e-02                    99.92742
#> Dim.13 1.970018e-04     2.515616e-02                    99.95257
#> Dim.14 1.281299e-04     1.636155e-02                    99.96893
#> Dim.15 9.645838e-05     1.231726e-02                    99.98125
#> Dim.16 8.767756e-05     1.119599e-02                    99.99245
#> Dim.17 5.870826e-05     7.496753e-03                    99.99994
#> Dim.18 4.402401e-07     5.621647e-05                   100.00000
#> Dim.19 5.852888e-32     7.473848e-30                   100.00000
# Determine the optimal number of clusters using the Elbow method on PCA components
data_pca_final<-prcomp(df1, center=FALSE, scale.=FALSE, rank. = 3)
results <- data_pca_final$x

1.4.1.1 PCA_plot

var<-get_pca_var(data_pca)
a<-fviz_contrib(data_pca, "var", axes=1, xtickslab.rt=90) # default angle=45°
plot(a,main = "Variables percentage contribution of first Principal Components")

b<-fviz_contrib(data_pca, "var", axes=2, xtickslab.rt=90) # default angle=45°
plot(a,main = "Variables percentage contribution of second Principal Components")

c<-fviz_contrib(data_pca, "var", axes=3, xtickslab.rt=90) # default angle=45°
plot(a,main = "Variables percentage contribution of second Principal Components")

d<-fviz_contrib(data_pca, "var", axes=4, xtickslab.rt=90) # default angle=45°
plot(a,main = "Variables percentage contribution of second Principal Components")

library("corrplot")
#> corrplot 0.92 loaded
corrplot(var$cos2, is.corr=FALSE)

fviz_pca_var(data_pca,
             col.var = "cos2", # Color by the quality of representation
             gradient.cols = c("darkorchid4", "gold", "darkorange"),
             repel = TRUE
)

######################################
### Rename columns to f1, f2, f3, ...
df1_new <- df1
new_col_names <- paste0("f", seq_along(colnames(df1_new)))
names(df1_new) <- new_col_names

# Create a mapping table of original and new column names
original_col_names <- colnames(dplyr::select(train_woe, -bad_loans))

data_pca1 <- prcomp(df1_new, center = TRUE, scale. = F)
loadings <- data_pca1$rotation

# Create a mapping table for reference
mapping_table <- data.frame(
  New_Name = new_col_names,
  Original_Name = colnames(dplyr::select(train_woe, -bad_loans))
)

fviz_pca_var(data_pca1,
             col.var = "cos2", # Color by the quality of representation
             gradient.cols = c("darkorchid4", "gold", "darkorange"),
             repel = TRUE
)

# Determine the optimal number of clusters using the Elbow method on PCA components
data_pca_final<-prcomp(df1, center=FALSE, scale.=FALSE, rank. = 3)
results <- data_pca_final$x

1.5 part5_Clustering

1.5.1 K-means on all variable

set.seed(1)
wss2 <- function(k) {
  kmeans(df1, centers = k, iter.max = 200, nstart = 100)$tot.withinss
}
k.values <- 1:10
wss_values2 <- sapply(k.values, wss2)

# Elbow method
plot(k.values, wss_values2, type = 'b', xlab = 'Number of Clusters', ylab = 'Total Within-Cluster Sum of Squares', main = 'Elbow Method')

# Perform k-means clustering with 4 cluster
km4 <- kmeans(df1, centers = 4, nstart =100)
fviz_cluster(km4, data = df1) +
  scale_color_manual(values = c('steelblue3', 'sandybrown', '#9CDB9E', 'lightpink4')) +
  scale_fill_manual(values = c('steelblue3', 'sandybrown', '#9CDB9E', 'lightpink4')) +
  ggtitle("Cluster for All Variables") +
  theme_minimal()

1.5.2 K-means on all result of pca

set.seed(2)
wss1 <- function(k) {
  kmeans(results, centers = k, iter.max = 100, nstart = 50)$tot.withinss
}
k.values <- 1:10
wss_values1 <- sapply(k.values, wss1)
wss_values1
#>  [1] 24923.572 14400.046 10627.189  8140.346  6908.100  5934.548  5332.015
#>  [8]  4739.680  4306.678  3945.009
# Plot the WSS values for each number of clusters
plot(k.values, wss_values1, type = 'b', xlab = 'Number of Clusters', ylab = 'Total Within-Cluster Sum of Squares', main = 'Elbow Method')

# Perform k-means clustering
set.seed(1)
km_res <- kmeans(results, centers = 3, nstart = 100)
fviz_cluster(km_res, data = results) +
  scale_color_manual(values = c('steelblue3', 'sandybrown', '#9CDB9E')) +
  scale_fill_manual(values = c('steelblue3', 'sandybrown', '#9CDB9E')) +
      ggtitle("3 Cluster for result of pca") +
  theme_minimal()

Customers_Segments <- data.frame(results, cluster = as.factor(km_res$cluster))

km_res$size
#> [1]  7309 19836 13724
km_res$centers
#>          PC1         PC2          PC3
#> 1 -1.1658261 -0.08222403 -0.007465017
#> 2 -0.1858829  0.09897490 -0.011609765
#> 3  0.5382256 -0.04571766  0.024009623
df1$groupkm <- km_res$cluster

g1<- df1[df1$groupkm==1,]
g2<- df1[df1$groupkm==2,]
g3<- df1[df1$groupkm==2,]

1.5.2.1 Clusters plots for kmeans

1.5.2.1.1 Group1
### Group 1
blue_palette <- colorRampPalette(c("lightblue", "blue", "darkblue"))

colors <- blue_palette(20)
#funded_amnt
g1 %>% 
  ggplot(aes(x = funded_amnt_woe, fill = factor(funded_amnt_woe))) +
  geom_bar(color = "grey20") + guides(fill = FALSE)+
  geom_text(stat = "count", aes(label = ..count..), 
            vjust = -0.3, size = 3.5) +
  labs(title="g1_funded_amnt_woe")+
  scale_x_continuous(breaks = seq(min(g1$funded_amnt_woe), max(g1$funded_amnt_woe) , by = 0.095))+
  scale_fill_manual(values = colors[1:length(unique(g1$loan_amnt_woe))])
#> Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
#> of ggplot2 3.3.4.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
#> Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
#> ℹ Please use `after_stat(count)` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.

min(g1$funded_amnt_woe)
#> [1] -0.1741437
max(g1$funded_amnt_woe)
#> [1] 0.2367068
mean(g1$funded_amnt_woe)
#> [1] -0.05075121

1.5.2.2 Summary table for each kmcluster

# Create summary table for each cluster
cluster_summary <- df1 %>%
  group_by(groupkm) %>%
  summarise(
    count = n(),
    loan_amnt_min = min(loan_amnt_woe),
    loan_amnt_max = max(loan_amnt_woe),
    loan_amnt_mean = mean(loan_amnt_woe),
    funded_amnt_min = min(funded_amnt_woe),
    funded_amnt_max = max(funded_amnt_woe),
    funded_amnt_mean = mean(funded_amnt_woe),
    sub_grade_num_min = min(sub_grade_num_woe),
    sub_grade_num_max = max(sub_grade_num_woe),
    emp_length_num_min = min(emp_length_num_woe),
    emp_length_num_max = max(emp_length_num_woe),
    dti_min = min(dti_woe),
    dti_max = max(dti_woe),
    dti_mean = mean(dti_woe),
    revol_util_min = min(revol_util_woe),
    revol_util_max = max(revol_util_woe),
    revol_util_mean = mean(revol_util_woe),
  )

# Display the summary
cluster_summary <- t(cluster_summary)
print(cluster_summary)
#>                             [,1]          [,2]          [,3]
#> groupkm               1.00000000  2.000000e+00  3.000000e+00
#> count              7309.00000000  1.983600e+04  1.372400e+04
#> loan_amnt_min        -0.17833591 -1.783359e-01 -1.783359e-01
#> loan_amnt_max         0.23743139  2.374314e-01  2.374314e-01
#> loan_amnt_mean       -0.05015715 -3.948019e-02  5.741985e-02
#> funded_amnt_min      -0.17414371 -1.741437e-01 -1.741437e-01
#> funded_amnt_max       0.23670680  2.367068e-01  2.367068e-01
#> funded_amnt_mean     -0.05075121 -3.863630e-02  5.722962e-02
#> sub_grade_num_min    -0.04653398 -4.653398e-02 -4.653398e-02
#> sub_grade_num_max     0.02524326  2.524326e-02  2.524326e-02
#> emp_length_num_min   -0.02886748 -2.886748e-02 -2.886748e-02
#> emp_length_num_max    0.16199733  1.619973e-01  1.619973e-01
#> dti_min              -0.36669167 -3.666917e-01 -3.666917e-01
#> dti_max               0.54295647  5.429565e-01  5.429565e-01
#> dti_mean             -0.08493578 -3.378141e-02  3.933095e-02
#> revol_util_min       -0.54728702 -5.472870e-01 -5.472870e-01
#> revol_util_max        0.41814344  4.181434e-01  4.181434e-01
#> revol_util_mean      -0.25603657 -3.597186e-02  1.061340e-01
cluster_summary_df <- as.data.frame(cluster_summary)

###table
#grade
grade_summary <- df1 %>%
  group_by(groupkm, grade_woe) %>%
  summarise(count = n(), .groups = 'drop') %>%
  pivot_wider(names_from = grade_woe, values_from = count, values_fill = list(count = 0))
kable(grade_summary, caption = "Counts of 'grade_woe' by Group")
Counts of ‘grade_woe’ by Group
groupkm -1.12236611095412 -0.319075617367842 0.103685264830283 0.421220703081032 0.860425570044054
1 7309 0 0 0 0
2 0 12370 7051 415 0
3 0 0 3018 5999 4707
#short_emp
short_emp_summary <- df1 %>%
  group_by(groupkm, short_emp_woe) %>%
  summarise(count = n(), .groups = 'drop') %>%
  pivot_wider(names_from = short_emp_woe, values_from = count, values_fill = list(count = 0))
kable(short_emp_summary, caption = "Counts of 'short_emp_woe' by Group")
Counts of ‘short_emp_woe’ by Group
groupkm -0.0244209293279626 0.161997334092502
1 6294 1015
2 17387 2449
3 12092 1632
#purpose
purpose_summary <- df1 %>%
  group_by(groupkm, purpose_woe) %>%
  summarise(count = n(), .groups = 'drop') %>%
  pivot_wider(names_from = purpose_woe, values_from = count, values_fill = list(count = 0))
kable(purpose_summary, caption = "Counts of 'purpose_woe' by Group")
Counts of ‘purpose_woe’ by Group
groupkm -0.196831871058625 -0.196608100580901 -0.160680606700238 0.0273586138416367 0.315718834677194
1 1686 631 683 3427 882
2 4253 1029 1374 10980 2200
3 2189 610 725 8317 1883
#delinq_2yrs
delinq_2yrs_summary <- df1 %>%
  group_by(groupkm, delinq_2yrs_woe) %>%
  summarise(count = n(), .groups = 'drop') %>%
  pivot_wider(names_from = delinq_2yrs_woe, values_from = count, values_fill = list(count = 0))
kable(delinq_2yrs_summary, caption = "Counts of 'delinq_2yrs_woe' by Group")
Counts of ‘delinq_2yrs_woe’ by Group
groupkm -0.0108046746319754 0.0639753139202767
1 6860 449
2 16881 2955
3 11338 2386
kable(grade_summary, caption = "Counts of 'grade_woe' by Group")
Counts of ‘grade_woe’ by Group
groupkm -1.12236611095412 -0.319075617367842 0.103685264830283 0.421220703081032 0.860425570044054
1 7309 0 0 0 0
2 0 12370 7051 415 0
3 0 0 3018 5999 4707
kable(short_emp_summary, caption = "Counts of 'short_emp_woe' by Group")
Counts of ‘short_emp_woe’ by Group
groupkm -0.0244209293279626 0.161997334092502
1 6294 1015
2 17387 2449
3 12092 1632
kable(purpose_summary, caption = "Counts of 'purpose_woe' by Group")
Counts of ‘purpose_woe’ by Group
groupkm -0.196831871058625 -0.196608100580901 -0.160680606700238 0.0273586138416367 0.315718834677194
1 1686 631 683 3427 882
2 4253 1029 1374 10980 2200
3 2189 610 725 8317 1883
kable(delinq_2yrs_summary, caption = "Counts of 'delinq_2yrs_woe' by Group")
Counts of ‘delinq_2yrs_woe’ by Group
groupkm -0.0108046746319754 0.0639753139202767
1 6860 449
2 16881 2955
3 11338 2386